{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save Markdown text into Vector DB"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Read Markdown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "\n",
    "pattern = os.path.join(MY_CONFIG.OUTPUT_DIR_MARKDOWN, '*.md')\n",
    "md_file_count = len(glob.glob(pattern, recursive=True)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 68 documents from 20 files\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "\n",
    "reader = SimpleDirectoryReader(input_dir=MY_CONFIG.OUTPUT_DIR_MARKDOWN, recursive=True )\n",
    "documents = reader.load_data()\n",
    "\n",
    "print (f\"Loaded {len(documents)} documents from {md_file_count} files\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doc ID: e7d8616a-3231-4498-9b31-023cf66485cc\n",
      "Text: Building the open future of AI  We are technology developers,\n",
      "researchers, industry leaders and advocates who collaborate to advance\n",
      "safe, responsible AI rooted in open innovation.  !Conference Speaker\n",
      "!Skills & Education\n"
     ]
    }
   ],
   "source": [
    "## Inspect a sample doc\n",
    "print (documents[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Create Chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created 87 chunks from 68 documents\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import Document\n",
    "from llama_index.core.node_parser import SentenceSplitter\n",
    "\n",
    "parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)\n",
    "nodes = parser.get_nodes_from_documents(documents)\n",
    "print(f\"Created {len(nodes)} chunks from {len(documents)} documents\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-4: Setup Embedding Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If connection to https://huggingface.co/ failed, uncomment the following path\n",
    "import os\n",
    "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "Settings.embed_model = HuggingFaceEmbedding(\n",
    "    model_name = MY_CONFIG.EMBEDDING_MODEL\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-5: Connect to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected to Milvus instance:  ./rag_html.db\n",
      "✅ Cleared collection : docs\n"
     ]
    }
   ],
   "source": [
    "## Clear up any old data\n",
    "\n",
    "from pymilvus import MilvusClient\n",
    "\n",
    "milvus_client = MilvusClient(MY_CONFIG.DB_URI)\n",
    "print (\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI )\n",
    "\n",
    "# if we already have a collection, clear it first\n",
    "if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):\n",
    "    milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)\n",
    "    print ('✅ Cleared collection :', MY_CONFIG.COLLECTION_NAME)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected Llama-index to Milvus instance:  ./rag_html.db\n"
     ]
    }
   ],
   "source": [
    "# connect llama-index to vector db\n",
    "from llama_index.core import VectorStoreIndex, StorageContext\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri = MY_CONFIG.DB_URI ,\n",
    "    dim = MY_CONFIG.EMBEDDING_LENGTH , \n",
    "    collection_name = MY_CONFIG.COLLECTION_NAME,\n",
    "    overwrite=True\n",
    ")\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "print (\"✅ Connected Llama-index to Milvus instance: \", MY_CONFIG.DB_URI )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-6: Save to DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %%time\n",
    "\n",
    "## We save entire md documents into vector store\n",
    "\n",
    "# from llama_index.core import VectorStoreIndex\n",
    "\n",
    "# index = VectorStoreIndex.from_documents(\n",
    "#     documents, storage_context=storage_context\n",
    "# )\n",
    "# print (f\"✅ Saved {len(documents)} documents to db: {MY_CONFIG.DB_URI}\" )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully stored 87 chunks in Milvus collection 'docs'\n",
      "CPU times: user 329 ms, sys: 119 ms, total: 448 ms\n",
      "Wall time: 433 ms\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "\n",
    "# save chunks into vector db\n",
    "\n",
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "index = VectorStoreIndex(\n",
    "        nodes=nodes,\n",
    "        storage_context=storage_context,\n",
    "    )\n",
    "\n",
    "print(f\"Successfully stored {len(nodes)} chunks in Milvus collection '{MY_CONFIG.COLLECTION_NAME}'\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dpk-rag-html-1-r1.1.0",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
